import pandas as pd
import numpy as np
from datetime import datetime

# Value substituted into the `{0}` placeholder of the data file names below.
c = 1000

# Random sample of sentences
from random import sample, seed
seed(123)  # fixed seed so the sampled examples are the same on every run

df = pd.read_csv('../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_rich_{0}.csv'.format(c))

# Draw 20 distinct raw sentences; sorting the set first makes the draw
# independent of set iteration order.
examples = sample(sorted(set(df['sentence_raw'])), 20)

# For each sampled sentence, write the sentence followed by every distinct
# narrative attached to it, separated by a \dotfill rule.
with open('../tables/Section_E_1.tex', 'w') as f:
    for sentence in examples:
        f.write('\n' + sentence + '\n' + '\n')

        # Look up the rows for this sentence directly inside the loop.
        narratives = sorted(set(df.loc[df['sentence_raw'] == sentence, 'narrative']))
        for narr in narratives:
            f.write(narr + '\n' + '\n')

        f.write('\\dotfill' + '\n')

# Release the frame; it is reloaded below with the date column parsed.
del df

# Import date field as date type
def dateparse(x):
    """Parse a single 'YYYY-MM-DD' string into a datetime."""
    return datetime.strptime(x, '%Y-%m-%d')


df = pd.read_csv('../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_rich_{0}.csv'.format(c))

# Convert the column after loading instead of passing `date_parser=` to
# read_csv: that argument is deprecated in pandas 2.0, whereas to_datetime
# with an explicit format behaves the same on old and new versions and
# parses the whole column in one vectorised call.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

df['year'] = df['date'].dt.year

# Narratives of the 20 (narrative, frequency) pairs with the highest
# `frequency` value.
examples = df[['narrative', 'frequency']].drop_duplicates()
examples = examples.sort_values('frequency', ascending=False)['narrative'].head(20)

# For each selected narrative, write the narrative followed by up to five
# randomly chosen distinct sentences, separated by a \dotfill rule.
with open('../tables/Section_E_2.tex', 'w') as f:
    for narr in examples:
        f.write('\n')
        f.write(narr + '\n')
        f.write('\n')

        unique_sentences = sorted(set(df.loc[df['narrative'] == narr, 'sentence_raw']))

        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the request at what is available.
        sentences = sample(unique_sentences, min(5, len(unique_sentences)))

        for sent in sentences:
            f.write(sent + '\n')
            f.write('\n')

        f.write('\\dotfill')
        f.write('\n')

# Make table of topics / n-grams / narratives
# Lift the column-width limit so long cell strings are not truncated.
pd.set_option('display.max_colwidth', None)

topics = pd.read_csv('../data/metadata/topics_labels_1000.csv')

# Topic names are the suffixes of the `topic_<name>` columns. Match on the
# prefix only, so a column that merely contains "topic" elsewhere in its
# name is not picked up and mangled.
topic_prefix = 'topic_'
list_of_topics_in_df = [
    column[len(topic_prefix):]
    for column in df.columns
    if column.startswith(topic_prefix)
]

t = []

for topic in list_of_topics_in_df:
    topic_col = 'topic_%s' % topic
    mean_col = 'topic_%s_mean' % topic

    # Build the ranking on an explicit copy; assigning a new column onto a
    # slice of `df` triggers pandas' SettingWithCopyWarning.
    ranking = df[['narrative']].copy()
    ranking[mean_col] = df[topic_col].groupby(df['narrative']).transform('mean')

    # One row per narrative, highest mean topic value first; keep the top 10.
    ranking = ranking.drop_duplicates().sort_values(mean_col, ascending=False)
    top_narratives = list(ranking['narrative'].head(10))

    # Labels in the topics file use spaces where the column names use '_'.
    topic_label = topic.replace('_', ' ')
    top_tokens = list(topics[topics['topic_label'] == topic_label]['words'])
    # `words` is a ', '-separated string; keep its first 10 entries.
    top_tokens = top_tokens[0].split(', ')[0:10]

    t.append([topic, ', '.join(top_tokens), ', '.join(top_narratives)])

t = pd.DataFrame(t, columns=['Topic', 'Top Words', 'Top Narratives'])
t.to_latex('../tables/Table_F_1_2_3.tex', index=False)

# Write a LaTeX table with the top K narratives for each of the N most
# frequent entities in the corpus.
K = 10  # narratives listed per entity
N = 20  # number of entities included

# Count how often each entity occurs in either argument column. The column
# names are set explicitly because the names produced by a bare
# `value_counts().reset_index()` differ between pandas 1.x and 2.x.
arg0_counts = df['ARGO'].value_counts().rename_axis('index').reset_index(name='ARGO')
arg1_counts = df['ARG1'].value_counts().rename_axis('index').reset_index(name='ARG1')
entity_counts = arg0_counts.merge(arg1_counts, on='index', how='outer')
entity_counts = entity_counts.fillna(0)  # entity seen in only one column
entity_counts['count'] = entity_counts['ARGO'] + entity_counts['ARG1']
entity_counts = entity_counts.sort_values(by='count', ascending=False).iloc[0:N]

with open('../tables/Table_G_1.tex', 'w') as f:

    f.write('\\begin{table} \n')
    f.write('\\centering \n')
    f.write('\\scriptsize \n')

    # Iterate the sorted column directly (not a set) so entities appear in
    # descending order of frequency and the output is identical on every run.
    for position, entity in enumerate(entity_counts['index'], start=1):

        mentions = df[(df.ARGO == entity) | (df.ARG1 == entity)]

        f.write('\\begin{tabularx}{0.24\\textwidth}{l} \n')

        f.write('\\hline \n')
        f.write(entity + '\\\\ \n')
        f.write('\\hline \n')

        # One row per narrative, then the K rows with the highest frequency.
        mentions = mentions.drop_duplicates(subset="narrative")
        mentions = mentions.sort_values(by='frequency', ascending=False).iloc[0:K]
        for narr in mentions.narrative:
            f.write(narr + '\\\\ \n')

        f.write('\\hline \n')
        f.write('\\end{tabularx} \n')

        # Four entity panels per row, then a vertical gap.
        if position % 4 == 0:
            f.write('\n')
            f.write('\\bigskip \n')
            f.write('\n')

    f.write('\n')
    f.write('\\caption{Most Frequent Narratives Per Entity} \n')
    f.write('\\label{tab: frequent_agent_narratives} \n')
    f.write('\\end{table} \n')


# Get most frequent narratives
# Keep one row per narrative, order by descending frequency and export the
# first 20 (narrative, frequency) rows as a LaTeX table.
one_row_per_narrative = df.drop_duplicates(subset=['narrative'])
by_frequency = one_row_per_narrative.sort_values(by=['frequency'], ascending=False)
most_frequent = by_frequency[['narrative', 'frequency']].head(n=20)
most_frequent.to_latex('../tables/Table_1.tex', index=False)

# Terms labeled as procedural
labels = pd.read_csv('../data/metadata/gpo_manual_cluster_labels/all_clusters_manually_labeled_{0}.csv'.format(c))

# Entities whose manual cluster label is 'noise'.
is_noise = labels['cluster_label_manual'] == 'noise'
noise_entities = list(labels.loc[is_noise, 'ARG'])

# Write the entities on a single line, each followed by ', '.
with open('../tables/Section_L.tex', 'w') as f:
    f.writelines(entity + ', ' for entity in noise_entities)

